{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[torchao](https://github.com/pytorch/ao) is a PyTorch architecture optimization library with support for custom high performance data types, quantization, and sparsity. It is composable with native PyTorch features such as [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) for even faster inference and training.\n",
    "\n",
    "To quantize a model, you need to install torchao and follow the examples below"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "!pip install torchao"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If the execution runtime is GPU, the code will quantize the model on the GPU with `device_map=\"auto\"`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "import torch\n",
    "from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer\n",
    "from torchao.quantization import Int8WeightOnlyConfig\n",
    "\n",
    "quant_config = Int8WeightOnlyConfig(group_size=128)\n",
    "quantization_config = TorchAoConfig(quant_type=quant_config)\n",
    "\n",
    "quantized_model = AutoModelForCausalLM.from_pretrained(\n",
    "    \"unsloth/Llama-3.2-1B-Instruct\",\n",
    "    torch_dtype=\"auto\",\n",
    "    device_map=\"auto\",\n",
    "    quantization_config=quantization_config\n",
    ")\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"unsloth/Llama-3.2-1B-Instruct\")\n",
    "input_text = \"What are we having for dinner?\"\n",
    "input_ids = tokenizer(input_text, return_tensors=\"pt\").to(\"cuda\")\n",
    "\n",
    "output = quantized_model.generate(**input_ids, max_new_tokens=10)\n",
    "print(tokenizer.decode(output[0], skip_special_tokens=True))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The example below will quantize the model on the CPU with `device_map=\"cpu\"`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "import torch\n",
    "from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer\n",
    "from torchao.quantization import Int8WeightOnlyConfig\n",
    "\n",
    "quant_config = Int8WeightOnlyConfig(group_size=128)\n",
    "quantization_config = TorchAoConfig(quant_type=quant_config)\n",
    "\n",
    "quantized_model = AutoModelForCausalLM.from_pretrained(\n",
    "    \"unsloth/Llama-3.2-1B-Instruct\",\n",
    "    torch_dtype=\"auto\",\n",
    "    device_map=\"cpu\",\n",
    "    quantization_config=quantization_config\n",
    ")\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"unsloth/Llama-3.2-1B-Instruct\")\n",
    "input_text = \"What are we having for dinner?\"\n",
    "input_ids = tokenizer(input_text, return_tensors=\"pt\")\n",
    "\n",
    "output = quantized_model.generate(**input_ids, max_new_tokens=10)\n",
    "print(tokenizer.decode(output[0], skip_special_tokens=True))\n"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
